In [1]:
import warnings
warnings.filterwarnings('ignore')

Linear Regression

It is a basic and commonly used type of predictive analysis. These regression estimates are used to explain the relationship between one dependent variable and one or more independent variables. Y = a + bX where

  • Y – Dependent Variable
  • a – intercept
  • X – Independent variable
  • b – Slope

Example: University GPA' = (0.675)(High School GPA) + 1.097

Library and Data

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix
train = pd.read_csv("linear-regression/train.csv")
test = pd.read_csv("linear-regression/test.csv")
train = train.dropna()
test = test.dropna()
train.head()
Out[2]:
x y
0 24.0 21.549452
1 50.0 47.464463
2 15.0 17.218656
3 38.0 36.586398
4 87.0 87.288984
In [3]:
X_train = np.array(train.iloc[:, :-1].values)
y_train = np.array(train.iloc[:, 1].values)
X_test = np.array(test.iloc[:, :-1].values)
y_test = np.array(test.iloc[:, 1].values)
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)

plt.plot(X_train, model.predict(X_train), color='green')
plt.show()
print(accuracy)
0.9888014444327563

Logistic Regression

It’s a classification algorithm, that is used where the response variable is categorical. The idea of Logistic Regression is to find a relationship between features and probability of particular outcome.

  • odds= p(x)/(1-p(x)) = probability of event occurrence / probability of not event occurrence

Example- When we have to predict if a student passes or fails in an exam when the number of hours spent studying is given as a feature, the response variable has two values, pass and fail.

Libraries and data

In [4]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from statistics import mode


train = pd.read_csv("titanic/train.csv")
test  = pd.read_csv('titanic/test.csv')
train.head()
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [5]:
ports = pd.get_dummies(train.Embarked , prefix='Embarked')
train = train.join(ports)
train.drop(['Embarked'], axis=1, inplace=True)
train.Sex = train.Sex.map({'male':0, 'female':1})
y = train.Survived.copy()
X = train.drop(['Survived'], axis=1)
X.drop(['Cabin'], axis=1, inplace=True)
X.drop(['Ticket'], axis=1, inplace=True)
X.drop(['Name'], axis=1, inplace=True)
X.drop(['PassengerId'], axis=1, inplace=True)
X.Age.fillna(X.Age.median(), inplace=True)

Model and Accuracy

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter = 500000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
print(accuracy)
0.8385650224215246

Confusion Matrix

In [7]:
confusion_matrix(y_test,y_pred)
Out[7]:
array([[127,  13],
       [ 23,  60]], dtype=int64)

Report

In [8]:
classification_report(y_test,y_pred)
Out[8]:
'              precision    recall  f1-score   support\n\n           0       0.85      0.91      0.88       140\n           1       0.82      0.72      0.77        83\n\n    accuracy                           0.84       223\n   macro avg       0.83      0.82      0.82       223\nweighted avg       0.84      0.84      0.84       223\n'

Gaussian Process Classifier

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)
from sklearn.gaussian_process import GaussianProcessClassifier
model = GaussianProcessClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
print(accuracy)
0.7488789237668162

Support Vector Machine

Example: One class is linearly separable from the others like if we only had two features like Height and Hair length of an individual, we’d first plot these two variables in two dimensional space where each point has two co-ordinates

Libraries and Data

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
data_svm = pd.read_csv("svm-classification/UniversalBank.csv")
data_svm.head()
Out[10]:
ID Age Experience Income ZIP Code Family CCAvg Education Mortgage Personal Loan Securities Account CD Account Online CreditCard
0 1 25 1 49 91107 4 1.6 1 0 0 1 0 0 0
1 2 45 19 34 90089 3 1.5 1 0 0 1 0 0 0
2 3 39 15 11 94720 1 1.0 1 0 0 0 0 0 0
3 4 35 9 100 94112 1 2.7 2 0 0 0 0 0 0
4 5 35 8 45 91330 4 1.0 2 0 0 0 0 0 1

Model and Accuracy

In [11]:
X = data_svm.iloc[:,1:13].values
y = data_svm.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies.mean()
Out[11]:
0.7069352220578753

Nu Support Vector Classification

Library and Data

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import NuSVC
nu_svm = pd.read_csv("svm-classification/UniversalBank.csv")
nu_svm.head()
Out[12]:
ID Age Experience Income ZIP Code Family CCAvg Education Mortgage Personal Loan Securities Account CD Account Online CreditCard
0 1 25 1 49 91107 4 1.6 1 0 0 1 0 0 0
1 2 45 19 34 90089 3 1.5 1 0 0 1 0 0 0
2 3 39 15 11 94720 1 1.0 1 0 0 0 0 0 0
3 4 35 9 100 94112 1 2.7 2 0 0 0 0 0 0
4 5 35 8 45 91330 4 1.0 2 0 0 0 0 0 1

Model and Accuracy

In [13]:
X = nu_svm.iloc[:,1:13].values
y = nu_svm.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
classifier = NuSVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies.mean()
Out[13]:
0.7008039784579209

Naive Bayes Algorithm

A naive Bayes classifier is not a single algorithm, but a family of machine learning algorithms which use probability theory to classify data with an assumption of independence between predictors It is easy to build and particularly useful for very large data sets. Along with simplicity, Naive Bayes is known to outperform even highly sophisticated classification methods

Example: Emails are given and we have to find the spam emails from that.A spam filter looks at email messages for certain key words and puts them in a spam folder if they match.

Libraries and Data

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
data = pd.read_csv('classification-suv-dataset/Social_Network_Ads.csv')
data_nb = data
data_nb.head()
Out[14]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0

Gaussian NB

In [15]:
X = data_nb.iloc[:, [2,3]].values
y = data_nb.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=GaussianNB()
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.9125

Bernoulli NB

In [16]:
X = data_nb.iloc[:, [2,3]].values
y = data_nb.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=BernoulliNB()
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.825

KNN

KNN does not learn any model. and stores the entire training data set which it uses as its representation.The output can be calculated as the class with the highest frequency from the K-most similar instances. Each instance in essence votes for their class and the class with the most votes is taken as the prediction

Example: Should the bank give a loan to an individual? Would an individual default on his or her loan? Is that person closer in characteristics to people who defaulted or did not default on their loans?

Libraries and Data

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn = pd.read_csv("iris/Iris.csv")
knn.head()
Out[17]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa

Model and Accuracy

In [18]:
X = knn.iloc[:, [1,2,3,4]].values
y = knn.iloc[:, 5].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
1.0

Perceptron

It is single layer neural network and used for classification

In [19]:
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
p = pd.read_csv("iris/Iris.csv")
p.head()
Out[19]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
In [20]:
X = p.iloc[:, [1,2,3,4]].values
y = p.iloc[:, 5].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=Perceptron()
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.9666666666666667

Random Forest

Random forest is collection of tress(forest) and it builds multiple decision trees and merges them together to get a more accurate and stable prediction.It can be used for both classification and regression problems.

Example: Suppose we have a bowl of 100 unique numbers from 0 to 99. We want to select a random sample of numbers from the bowl. If we put the number back in the bowl, it may be selected more than once.

Libraries and Data

In [21]:
from sklearn.ensemble import RandomForestClassifier
rf = pd.read_csv("mushroom-classification/mushrooms.csv")
rf.head()
Out[21]:
class cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing gill-size gill-color ... stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population habitat
0 p x s n t p f c n k ... s w w p w o p k s u
1 e x s y t a f c b k ... s w w p w o p n n g
2 e b s w t l f c b n ... s w w p w o p n n m
3 p x y w t p f c n n ... s w w p w o p k s u
4 e x s g f n f w b k ... s w w p w o e n a g

5 rows × 23 columns

Model and Accuracy

In [22]:
X = rf.drop('class', axis=1)
y = rf['class']
X = pd.get_dummies(X)
y = pd.get_dummies(y)
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
model.fit(X_train, y_train)
model.score(X_test, y_test)
Out[22]:
1.0

Decision Tree

Decision tree algorithm is classification algorithm under supervised machine learning and it is simple to understand and use in data.The idea of Decision tree is to split the big data(root) into smaller(leaves)

In [23]:
from sklearn.tree import DecisionTreeClassifier
dt = data
dt.head()
Out[23]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0
In [24]:
X = dt.iloc[:, [2,3]].values
y = dt.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=DecisionTreeClassifier(criterion="entropy",random_state=0)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.9

Extra Tree

Library and Data

In [25]:
from sklearn.ensemble import ExtraTreesClassifier
et = data
et.head()
Out[25]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0

Model and Accuracy

In [26]:
X = et.iloc[:, [2,3]].values
y = et.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=ExtraTreesClassifier(criterion="entropy",random_state=0)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.8875

AdaBoost Classifier

Library and Data

In [27]:
from sklearn.ensemble import AdaBoostClassifier
ac = data
ac.head()
Out[27]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0

Model and Accuracy

In [28]:
X = ac.iloc[:, [2,3]].values
y = ac.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=AdaBoostClassifier(random_state=0)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.925

Passive Aggressive Classifier

Library and Data

In [29]:
from sklearn.linear_model import PassiveAggressiveClassifier
pac = data
pac.head()
Out[29]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0

Model and Accuracy

In [30]:
X = pac.iloc[:, [2,3]].values
y = pac.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=PassiveAggressiveClassifier(random_state=0)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.9125

Bagging Classifier

Library and Data

In [31]:
from sklearn.ensemble import BaggingClassifier
bc = data
bc.head()
Out[31]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0

Model and Accuracy

In [32]:
X = bc.iloc[:, [2,3]].values
y = bc.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=BaggingClassifier(random_state=0)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.875

Gradient Boosting

Gradient boosting is an alogithm under supervised machine learning, boosting means converting weak into strong. In this new tree is boosted over the previous tree

Library and Data

In [33]:
from sklearn.ensemble import GradientBoostingClassifier
gb = data
gb.head()
Out[33]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0

Model and Accuracy

In [34]:
X = gb.iloc[:, [2,3]].values
y = gb.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
gbk = GradientBoostingClassifier()
gbk.fit(X_train, y_train)
pred = gbk.predict(X_test)
acc=accuracy_score(y_test, y_pred)
print(acc)
0.875

Light GBM

LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

  1. Faster training speed and higher efficiency.
  2. Lower memory usage.
  3. Better accuracy.
  4. Support of parallel and GPU learning.
  5. Capable of handling large-scale data.

Library and Data

In [35]:
import lightgbm as lgbm
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import preprocessing


train = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")
data = pd.concat([train, test], sort=False)
data = data.reset_index(drop=True)
data.head()
Out[35]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500.0
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500.0
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500.0
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000.0
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000.0

5 rows × 81 columns

Preprocessing

In [36]:
nans=pd.isnull(data).sum()

data['MSZoning']  = data['MSZoning'].fillna(data['MSZoning'].mode()[0])
data['Utilities'] = data['Utilities'].fillna(data['Utilities'].mode()[0])
data['Exterior1st'] = data['Exterior1st'].fillna(data['Exterior1st'].mode()[0])
data['Exterior2nd'] = data['Exterior2nd'].fillna(data['Exterior2nd'].mode()[0])

data["BsmtFinSF1"]  = data["BsmtFinSF1"].fillna(0)
data["BsmtFinSF2"]  = data["BsmtFinSF2"].fillna(0)
data["BsmtUnfSF"]   = data["BsmtUnfSF"].fillna(0)
data["TotalBsmtSF"] = data["TotalBsmtSF"].fillna(0)
data["BsmtFullBath"] = data["BsmtFullBath"].fillna(0)
data["BsmtHalfBath"] = data["BsmtHalfBath"].fillna(0)
data["BsmtQual"] = data["BsmtQual"].fillna("None")
data["BsmtCond"] = data["BsmtCond"].fillna("None")
data["BsmtExposure"] = data["BsmtExposure"].fillna("None")
data["BsmtFinType1"] = data["BsmtFinType1"].fillna("None")
data["BsmtFinType2"] = data["BsmtFinType2"].fillna("None")

data['KitchenQual']  = data['KitchenQual'].fillna(data['KitchenQual'].mode()[0])
data["Functional"]   = data["Functional"].fillna("Typ")
data["FireplaceQu"]  = data["FireplaceQu"].fillna("None")

data["GarageType"]   = data["GarageType"].fillna("None")
data["GarageYrBlt"]  = data["GarageYrBlt"].fillna(0)
data["GarageFinish"] = data["GarageFinish"].fillna("None")
data["GarageCars"] = data["GarageCars"].fillna(0)
data["GarageArea"] = data["GarageArea"].fillna(0)
data["GarageQual"] = data["GarageQual"].fillna("None")
data["GarageCond"] = data["GarageCond"].fillna("None")

data["PoolQC"] = data["PoolQC"].fillna("None")
data["Fence"]  = data["Fence"].fillna("None")
data["MiscFeature"] = data["MiscFeature"].fillna("None")
data['SaleType']    = data['SaleType'].fillna(data['SaleType'].mode()[0])
data['LotFrontage'].interpolate(method='linear',inplace=True)
data["Electrical"]  = data.groupby("YearBuilt")['Electrical'].transform(lambda x: x.fillna(x.mode()[0]))
data["Alley"] = data["Alley"].fillna("None")

data["MasVnrType"] = data["MasVnrType"].fillna("None")
data["MasVnrArea"] = data["MasVnrArea"].fillna(0)
nans=pd.isnull(data).sum()
nans[nans>0]
Out[36]:
SalePrice    1459
dtype: int64
In [37]:
_list = []
for col in data.columns:
    if type(data[col][0]) == type('str'):
        _list.append(col)

le = preprocessing.LabelEncoder()
for li in _list:
    le.fit(list(set(data[li])))
    data[li] = le.transform(data[li])

train, test = data[:len(train)], data[len(train):]

X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']

test = test.drop(columns=['SalePrice', 'Id'])

Model and Accuracy

In [38]:
kfold = KFold(n_splits=5, random_state = 2020, shuffle = True)

model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
model_lgb.fit(X, y)
r2_score(model_lgb.predict(X), y)
Out[38]:
0.9687841828399498

XGBoost

XGBoost is a decision-tree-based ensemble Machine Learning algorithm that uses a gradient boosting framework. In prediction problems involving unstructured data (images, text, etc.) artificial neural networks tend to outperform all other algorithms or frameworks.It is a perfect combination of software and hardware optimization techniques to yield superior results using less computing resources in the shortest amount of time.

Library and Data

In [39]:
import xgboost as xgb
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[39]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [40]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_xgb.fit(X, y)
r2_score(model_xgb.predict(X), y)
Out[40]:
0.9979187668877296

Catboost

Catboost is a type of gradient boosting algorithms which can automatically deal with categorical variables without showing the type conversion error, which helps you to focus on tuning your model better rather than sorting out trivial errors.Make sure you handle missing data well before you proceed with the implementation.

Library and Data

In [41]:
from catboost import CatBoostRegressor
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[41]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [42]:
cb_model = CatBoostRegressor(iterations=500,
                             learning_rate=0.05,
                             depth=10,
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)
cb_model.fit(X, y)
r2_score(cb_model.predict(X), y)
0:	learn: 76972.0985628	total: 95.5ms	remaining: 47.6s
50:	learn: 26415.2641368	total: 2.31s	remaining: 20.3s
100:	learn: 16709.9071671	total: 4.52s	remaining: 17.8s
150:	learn: 13122.2560081	total: 6.81s	remaining: 15.7s
200:	learn: 10679.6814896	total: 9.07s	remaining: 13.5s
250:	learn: 9032.4332082	total: 11.4s	remaining: 11.3s
300:	learn: 7635.2922883	total: 13.7s	remaining: 9.04s
350:	learn: 6447.9765142	total: 16.1s	remaining: 6.83s
400:	learn: 5348.0870745	total: 18.4s	remaining: 4.55s
450:	learn: 4444.7443814	total: 20.8s	remaining: 2.26s
499:	learn: 3768.1753183	total: 23.2s	remaining: 0us
Out[42]:
0.9977125026795101

Stochastic Gradient Descent

Stochastic means random , so in Stochastic Gradient Descent dataset sample is choosedn random instead of the whole dataset.hough, using the whole dataset is really useful for getting to the minima in a less noisy or less random manner, but the problem arises when our datasets get really huge and for that SGD come in action

Library and Data

In [43]:
from sklearn.linear_model import SGDRegressor
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[43]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [44]:
SGD = SGDRegressor(max_iter = 100)
SGD.fit(X, y)
r2_score(SGD.predict(X), y)
Out[44]:
-0.40444935116294056

Lasso

In statistics and machine learning, lasso (least absolute shrinkage and selection operator; also Lasso or LASSO) is a regression analysis method that performs both variable selection and regularization in order to enhance the prediction accuracy and interpretability of the statistical model it produces. Though originally defined for least squares, lasso regularization is easily extended to a wide variety of statistical models including generalized linear models, generalized estimating equations, proportional hazards models, and M-estimators, in a straightforward fashion

Library and Data

In [45]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[45]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [46]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso.fit(X, y)
r2_score(lasso.predict(X), y)
Out[46]:
0.8293764275536554

Ridge Classifier CV

Library and Data

In [47]:
from sklearn.linear_model import RidgeClassifierCV
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[47]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [48]:
rcc = RidgeClassifierCV()
rcc.fit(X, y)
r2_score(rcc.predict(X), y)
Out[48]:
0.6662347422006982

Kernel Ridge Regression

KRR combine Ridge regression and classification with the kernel trick.It is similar to Support vector Regression but relatively very fast.This is suitable for smaller dataset (less than 100 samples)

Library and Data

In [49]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[49]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [50]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
KRR.fit(X, y)
r2_score(KRR.predict(X), y)
Out[50]:
0.9840030592828615

BayesianRidge

Bayesian regression, is a regression model defined in probabilistic terms, with explicit priors on the parameters. The choice of priors can have the regularizing effect.Bayesian approach is a general way of defining and estimating statistical models that can be applied to different models.

Library and Data

In [51]:
from sklearn.linear_model  import BayesianRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[51]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [52]:
BR = BayesianRidge()
BR.fit(X, y)
r2_score(BR.predict(X), y)
Out[52]:
0.8167560815333851

Elastic Net Regression

Elastic net is a hybrid of ridge regression and lasso regularization.It combines feature elimination from Lasso and feature coefficient reduction from the Ridge model to improve your model's predictions.

Library and Data

In [53]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
#Data is used the same as LGB
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']
X.head()
Out[53]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 1 3 3 0 4 ... 0 0 3 4 1 0 2 2008 8 4
1 20 3 80.0 9600 1 1 3 3 0 2 ... 0 0 3 4 1 0 5 2007 8 4
2 60 3 68.0 11250 1 1 0 3 0 4 ... 0 0 3 4 1 0 9 2008 8 4
3 70 3 60.0 9550 1 1 0 3 0 0 ... 0 0 3 4 1 0 2 2006 8 0
4 60 3 84.0 14260 1 1 0 3 0 2 ... 0 0 3 4 1 0 12 2008 8 4

5 rows × 79 columns

Model and Accuracy

In [54]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
ENet.fit(X, y)
r2_score(ENet.predict(X), y)
Out[54]:
0.82933295622003

LDA

A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.Itis used in statistics, pattern recognition, and machine learning to find a linear combination of features that characterizes or separates two or more classes of objects or events. The resulting combination may be used as a linear classifier, or, more commonly, for dimensionality reduction before later classification.

Library and Data

In [55]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = data
lda.head()
Out[55]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 3 65.0 8450 1 1 3 3 0 ... 0 3 4 1 0 2 2008 8 4 208500.0
1 2 20 3 80.0 9600 1 1 3 3 0 ... 0 3 4 1 0 5 2007 8 4 181500.0
2 3 60 3 68.0 11250 1 1 0 3 0 ... 0 3 4 1 0 9 2008 8 4 223500.0
3 4 70 3 60.0 9550 1 1 0 3 0 ... 0 3 4 1 0 2 2006 8 0 140000.0
4 5 60 3 84.0 14260 1 1 0 3 0 ... 0 3 4 1 0 12 2008 8 4 250000.0

5 rows × 81 columns

Model and Accuracy

In [56]:
X = lda.iloc[:, [2,3]].values
y = lda.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
Model=LinearDiscriminantAnalysis()
Model.fit(X_train,y_train)
y_pred=Model.predict(X_test)
print('accuracy is ',accuracy_score(y_pred,y_test))
accuracy is  0.0273972602739726

K-Means Algorithm

K-means clustering is a type of unsupervised learning, which is used when you have unlabeled data and the goal of this algorithm is to find groups in the data

Steps to use this algorithm:

  • 1-Clusters the data into k groups where k is predefined.
  • 2-Select k points at random as cluster centers.
  • 3-Assign objects to their closest cluster center according to the Euclidean distance function.
  • 4-Calculate the centroid or mean of all objects in each cluster.

Examples: Behavioral segmentation like segment by purchase history or by activities on application, website, or platform Separate valid activity groups from bots

Libraries and Data

In [57]:
from sklearn.cluster import KMeans
km = pd.read_csv("k-mean/km.csv")
km.head()
Out[57]:
id latitude longitude
0 1 37.460459 126.440680
1 1 37.478832 126.668558
2 2 37.562143 126.801884
3 2 37.567454 127.005627
4 3 37.460459 126.440680

Checking for number of clusters

In [58]:
K_clusters = range(1,8)
kmeans = [KMeans(n_clusters=i) for i in K_clusters]
Y_axis = km[['latitude']]
X_axis = km[['longitude']]
# score = [kmeans[i].fit(Y_axis).score(Y_axis) for i in range(len(kmeans))]
# plt.plot(K_clusters, score)
# plt.xlabel('Number of Clusters')
# plt.ylabel('Score')
# plt.show()

Fitting Model

In [59]:
kmeans = KMeans(n_clusters = 3, init ='k-means++')
kmeans.fit(km[km.columns[1:3]])
km['cluster_label'] = kmeans.fit_predict(km[km.columns[1:3]])
centers = kmeans.cluster_centers_
labels = kmeans.predict(km[km.columns[1:3]])
km.cluster_label.unique()
Out[59]:
array([1, 0, 2], dtype=int64)

Plotting Clusters

In [60]:
km.plot.scatter(x = 'latitude', y = 'longitude', c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.5)
Out[60]:
<matplotlib.collections.PathCollection at 0x264bbdf75c8>

CNN

Library and Data

In [61]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
import tensorflow as tf
train_data = pd.read_csv("digit-recognizer/train.csv")
test_data = pd.read_csv("digit-recognizer/test.csv")
train_data.head()
Using TensorFlow backend.
Out[61]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns

Preprocessing and Data Split

In [62]:
X = np.array(train_data.drop("label", axis=1)).astype('float32')
y = np.array(train_data['label']).astype('float32')
for i in range(9):
    plt.subplot(3,3,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X[i].reshape(28, 28), cmap=plt.cm.binary)
    plt.xlabel(y[i])
plt.show()

X = X / 255.0
X = X.reshape(-1, 28, 28, 1)
y = to_categorical(y)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_test = np.array(test_data).astype('float32')
X_test = X_test / 255.0
X_test = X_test.reshape(-1, 28, 28, 1)
plt.figure(figsize=(10,10))
Out[62]:
<Figure size 720x720 with 0 Axes>
<Figure size 720x720 with 0 Axes>

Model

In [63]:
model = Sequential()
model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same',
                 activation ='relu', input_shape = (28,28,1)))
model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same',
                 activation ='relu'))
model.add(MaxPool2D(pool_size=(2,2)))
model.add(Dropout(0.25))
model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same',
                 activation ='relu'))
model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same',
                 activation ='relu'))
model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(10, activation = "softmax"))
model.summary()
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model1.png')
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_1 (Conv2D)            (None, 28, 28, 32)        832       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 28, 28, 32)        25632     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 14, 14, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 14, 14, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 14, 14, 64)        18496     
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 14, 14, 64)        36928     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 7, 7, 64)          0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 7, 7, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               803072    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2570      
=================================================================
Total params: 887,530
Trainable params: 887,530
Non-trainable params: 0
_________________________________________________________________
Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.

Compiling Model

In [64]:
#increse to epochs to 30 for better accuracy
model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, batch_size=85, validation_data=(X_val, y_val))
Train on 33600 samples, validate on 8400 samples
Epoch 1/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.2759 - accuracy: 0.9117 - val_loss: 0.0536 - val_accuracy: 0.9829
Epoch 2/10
33600/33600 [==============================] - 115s 3ms/step - loss: 0.0831 - accuracy: 0.9754 - val_loss: 0.0416 - val_accuracy: 0.9887
Epoch 3/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0616 - accuracy: 0.9812 - val_loss: 0.0379 - val_accuracy: 0.9883
Epoch 4/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0533 - accuracy: 0.9844 - val_loss: 0.0331 - val_accuracy: 0.9898
Epoch 5/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0445 - accuracy: 0.9860 - val_loss: 0.0305 - val_accuracy: 0.9904
Epoch 6/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0350 - accuracy: 0.9883 - val_loss: 0.0339 - val_accuracy: 0.9906
Epoch 7/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0338 - accuracy: 0.9891 - val_loss: 0.0301 - val_accuracy: 0.9925
Epoch 8/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0318 - accuracy: 0.9899 - val_loss: 0.0279 - val_accuracy: 0.9925
Epoch 9/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0264 - accuracy: 0.9917 - val_loss: 0.0281 - val_accuracy: 0.9923
Epoch 10/10
33600/33600 [==============================] - 116s 3ms/step - loss: 0.0261 - accuracy: 0.9916 - val_loss: 0.0265 - val_accuracy: 0.9933
In [65]:
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
epochs = range(len(accuracy))
plt.plot(epochs, accuracy, 'bo', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'b', label='Validation accuracy')
plt.show()

print(model.evaluate(X_val, y_val))
8400/8400 [==============================] - 12s 1ms/step - ETA: 11s
[0.026516210664710906, 0.9933333396911621]

LSTM

LSTM blocks are part of a recurrent neural network structure. Recurrent neural networks are made to utilize certain types of artificial memory processes that can help these artificial intelligence programs to more effectively imitate human thought.It is capable of learning order dependence LSTM can be used for machine translation, speech recognition, and more.

Library and Data

In [66]:
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
lstm = pd.read_csv("nyse/prices.csv")
lstm = lstm[lstm['symbol']=="NFLX"]
lstm['date'] = pd.to_datetime(lstm['date'])
lstm.set_index('date',inplace=True)
lstm = lstm.reset_index()
lstm.head()
Out[66]:
date symbol open close low high volume
0 2010-01-04 NFLX 55.519999 53.479999 52.960001 55.730000 17239600.0
1 2010-01-05 NFLX 53.570001 51.510001 50.810001 53.599998 23753100.0
2 2010-01-06 NFLX 51.530001 53.319999 50.380002 53.710001 23290400.0
3 2010-01-07 NFLX 54.120000 52.400001 52.240001 54.300001 9955400.0
4 2010-01-08 NFLX 52.490000 53.300002 52.260001 54.199999 8180900.0

Preprocessing

In [67]:
data = lstm.filter(['close'])
dataset = data.values
training_data_len = math.ceil(len(dataset)*.75)
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(dataset)
train_data = scaled_data[0:training_data_len, :]
x_train = []
y_train = []
for i in range(60,len(train_data)):
    x_train.append(train_data[i-60:i, 0])
    y_train.append(train_data[i,0])
x_train,y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train,(x_train.shape[0],x_train.shape[1],1))

Model

In [68]:
model =Sequential()
model.add(LSTM(64,return_sequences=True, input_shape=(x_train.shape[1],1)))
model.add(LSTM(64, return_sequences= False))
model.add(Dense(32))
model.add(Dense(1))
model.summary()
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model1.png')
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 60, 64)            16896     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
=================================================================
Total params: 52,033
Trainable params: 52,033
Non-trainable params: 0
_________________________________________________________________
Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.

Compiling Model

In [69]:
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train,y_train, batch_size=85, epochs=20)
Epoch 1/20
1262/1262 [==============================] - 6s 5ms/step - loss: 0.0298
Epoch 2/20
1262/1262 [==============================] - 3s 3ms/step - loss: 0.0052
Epoch 3/20
1262/1262 [==============================] - 3s 3ms/step - loss: 0.0013
Epoch 4/20
1262/1262 [==============================] - 3s 3ms/step - loss: 9.3545e-04
Epoch 5/20
1262/1262 [==============================] - 3s 3ms/step - loss: 7.9703e-04
Epoch 6/20
1262/1262 [==============================] - 3s 3ms/step - loss: 7.7535e-04
Epoch 7/20
1262/1262 [==============================] - 3s 3ms/step - loss: 7.5249e-04
Epoch 8/20
1262/1262 [==============================] - 3s 3ms/step - loss: 7.3337e-04
Epoch 9/20
1262/1262 [==============================] - 3s 3ms/step - loss: 7.1946e-04
Epoch 10/20
1262/1262 [==============================] - 3s 3ms/step - loss: 7.0998e-04
Epoch 11/20
1262/1262 [==============================] - 3s 3ms/step - loss: 6.7394e-04
Epoch 12/20
1262/1262 [==============================] - 4s 3ms/step - loss: 6.5635e-04
Epoch 13/20
1262/1262 [==============================] - 3s 3ms/step - loss: 6.3674e-04
Epoch 14/20
1262/1262 [==============================] - 3s 3ms/step - loss: 6.4215e-04
Epoch 15/20
1262/1262 [==============================] - 4s 3ms/step - loss: 6.4284e-04
Epoch 16/20
1262/1262 [==============================] - 3s 3ms/step - loss: 5.9654e-04
Epoch 17/20
1262/1262 [==============================] - 4s 3ms/step - loss: 5.7134e-04
Epoch 18/20
1262/1262 [==============================] - 4s 3ms/step - loss: 5.5673e-04
Epoch 19/20
1262/1262 [==============================] - 3s 2ms/step - loss: 5.6984e-04
Epoch 20/20
1262/1262 [==============================] - 3s 2ms/step - loss: 5.7418e-04
Out[69]:
<keras.callbacks.callbacks.History at 0x264a3c64e08>

Prediction and Accuracy

In [70]:
test_data= scaled_data[training_data_len-60:, :]
x_test = []
y_test = dataset[training_data_len:,:]
for i in range(60,len(test_data)):
    x_test.append(test_data[i-60:i,0])
x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1],1))
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
rmse = np.sqrt(np.mean(predictions - y_test)**2)
rmse
Out[70]:
3.911900236180531

Principle Component Analysis

It's an important method for dimension reduction.It extracts low dimensional set of features from a high dimensional data set with a motive to capture as much information as possible and to visualise high-dimensional data, it also reduces noise and finally makes other algorithms to work better because we are injecting fewer inputs.

Example: When we have to bring out strong patterns in a data set or to make data easy to explore and visualize

In [71]:
from sklearn.datasets import make_blobs
from sklearn import datasets
class PCA:
  def __init__(self, n_components):
    self.n_components = n_components
    self.components = None
    self.mean = None

  def fit(self, X):
    self.mean = np.mean(X, axis=0)
    X = X - self.mean
    cov = np.cov(X.T)

    evalue, evector = np.linalg.eig(cov)

    eigenvectors = evector.T
    idxs = np.argsort(evalue)[::-1]

    evalue = evalue[idxs]
    evector = evector[idxs]
    self.components = evector[0:self.n_components]

  def transform(self, X):
    #project data
    X = X - self.mean
    return(np.dot(X, self.components.T))

data = datasets.load_iris()
X = data.data
y = data.target

pca = PCA(2)
pca.fit(X)
X_projected = pca.transform(X)



x1 = X_projected[:,0]
x2 = X_projected[:,1]

plt.scatter(x1,x2,c=y,edgecolor='none',alpha=0.8,cmap=plt.cm.get_cmap('viridis',3))
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar()
plt.show()

Apriori

It is a categorisation algorithm attempts to operate on database records, particularly transactional records, or records including certain numbers of fields or items.It is mainly used for sorting large amounts of data. Sorting data often occurs because of association rules.

Example: To analyse data for frequent if/then patterns and using the criteria support and confidence to identify the most important relationships.

In [72]:
df = pd.read_csv('supermarket/GroceryStoreDataSet.csv',names=['products'],header=None)
data = list(df["products"].apply(lambda x:x.split(',')))
data
Out[72]:
[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]
In [74]:
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_data = te.fit(data).transform(data)
df = pd.DataFrame(te_data,columns=te.columns_)
df1 = apriori(df,min_support=0.01,use_colnames=True)
df1.head()
Out[74]:
support itemsets
0 0.35 (BISCUIT)
1 0.20 (BOURNVITA)
2 0.65 (BREAD)
3 0.15 (COCK)
4 0.40 (COFFEE)

Prophet

Prophet is an extremely easy tool for analysts to produce reliable forecasts

  1. Prophet only takes data as a dataframe with a ds (datestamp) and y (value we want to forecast) column. So first, let’s convert the dataframe to the appropriate format.
  2. Create an instance of the Prophet class and then fit our dataframe to it.
  3. Create a dataframe with the dates for which we want a prediction to be made with make_future_dataframe(). Then specify the number of days to forecast using the periods parameter.
  4. Call predict to make a prediction and store it in the forecast dataframe. What’s neat here is that you can inspect the dataframe and see the predictions as well as the lower and upper boundaries of the uncertainty interval.

Library and Data

In [78]:
import plotly.offline as py
import plotly.express as px
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, add_changepoints_to_plot

pred = pd.read_csv("coronavirus-2019ncov/covid-19-all.csv")
pred = pred.fillna(0)
predgrp = pred.groupby("Date")[["Confirmed","Recovered","Deaths"]].sum().reset_index()
pred_cnfrm = predgrp.loc[:,["Date","Confirmed"]]
pr_data = pred_cnfrm
pr_data.columns = ['ds','y']
pr_data.head()
Out[78]:
ds y
0 2020-01-22 555.0
1 2020-01-23 653.0
2 2020-01-24 941.0
3 2020-01-25 1438.0
4 2020-01-26 2118.0

Model and Forecast

In [79]:
m=Prophet()
m.fit(pr_data)
future=m.make_future_dataframe(periods=15)
forecast=m.predict(future)
forecast
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Out[79]:
ds trend yhat_lower yhat_upper trend_lower trend_upper additive_terms additive_terms_lower additive_terms_upper weekly weekly_lower weekly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 2020-01-22 -6.740434e+02 -1.353089e+04 2.651731e+03 -6.740434e+02 -6.740434e+02 -4432.675842 -4432.675842 -4432.675842 -4432.675842 -4432.675842 -4432.675842 0.0 0.0 0.0 -5.106719e+03
1 2020-01-23 -8.110030e+01 -9.637391e+03 6.393800e+03 -8.110030e+01 -8.110030e+01 -1501.388118 -1501.388118 -1501.388118 -1501.388118 -1501.388118 -1501.388118 0.0 0.0 0.0 -1.582488e+03
2 2020-01-24 5.118427e+02 -3.219560e+03 1.243235e+04 5.118427e+02 5.118427e+02 4369.882653 4369.882653 4369.882653 4369.882653 4369.882653 4369.882653 0.0 0.0 0.0 4.881725e+03
3 2020-01-25 1.104786e+03 -2.751037e+03 1.357847e+04 1.104786e+03 1.104786e+03 4373.665431 4373.665431 4373.665431 4373.665431 4373.665431 4373.665431 0.0 0.0 0.0 5.478451e+03
4 2020-01-26 2.671462e+03 -2.595618e+03 1.220225e+04 2.671462e+03 2.671462e+03 2320.327234 2320.327234 2320.327234 2320.327234 2320.327234 2320.327234 0.0 0.0 0.0 4.991790e+03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
118 2020-05-19 4.796129e+06 4.739947e+06 4.855339e+06 4.746662e+06 4.861540e+06 -4402.027356 -4402.027356 -4402.027356 -4402.027356 -4402.027356 -4402.027356 0.0 0.0 0.0 4.791727e+06
119 2020-05-20 4.876179e+06 4.814107e+06 4.944349e+06 4.819936e+06 4.950845e+06 -4432.675842 -4432.675842 -4432.675842 -4432.675842 -4432.675842 -4432.675842 0.0 0.0 0.0 4.871747e+06
120 2020-05-21 4.956229e+06 4.892332e+06 5.039016e+06 4.892599e+06 5.038904e+06 -1501.388118 -1501.388118 -1501.388118 -1501.388118 -1501.388118 -1501.388118 0.0 0.0 0.0 4.954728e+06
121 2020-05-22 5.036280e+06 4.968077e+06 5.132247e+06 4.962351e+06 5.126969e+06 4369.882653 4369.882653 4369.882653 4369.882653 4369.882653 4369.882653 0.0 0.0 0.0 5.040649e+06
122 2020-05-23 5.116330e+06 5.037084e+06 5.221758e+06 5.032920e+06 5.215227e+06 4373.665431 4373.665431 4373.665431 4373.665431 4373.665431 4373.665431 0.0 0.0 0.0 5.120703e+06

123 rows × 16 columns

In [80]:
fig = plot_plotly(m, forecast)
py.iplot(fig)

fig = m.plot(forecast,xlabel='Date',ylabel='Confirmed Count')

Arima

Library and Data

In [81]:
import datetime
from statsmodels.tsa.arima_model import ARIMA
ar = pd.read_csv("competitive-data-science-predict-future-sales/sales_train.csv")
ar.date=ar.date.apply(lambda x:datetime.datetime.strptime(x, '%d.%m.%Y'))
ar=ar.groupby(["date_block_num"])["item_cnt_day"].sum()
ar.index=pd.date_range(start = '2013-01-01',end='2015-10-01', freq = 'MS')
ar=ar.reset_index()
ar=ar.loc[:,["index","item_cnt_day"]]
ar.columns = ['confirmed_date','count']
ar.head()
Out[81]:
confirmed_date count
0 2013-01-01 131479.0
1 2013-02-01 128090.0
2 2013-03-01 147142.0
3 2013-04-01 107190.0
4 2013-05-01 106970.0

Model

In [82]:
model = ARIMA(ar['count'].values, order=(1, 2, 1))
fit_model = model.fit(trend='c', full_output=True, disp=True)
fit_model.summary()
Out[82]:
ARIMA Model Results
Dep. Variable: D2.y No. Observations: 32
Model: ARIMA(1, 2, 1) Log Likelihood -367.756
Method: css-mle S.D. of innovations 22262.761
Date: Wed, 13 May 2020 AIC 743.512
Time: 18:52:51 BIC 749.375
Sample: 2 HQIC 745.456
coef std err z P>|z| [0.025 0.975]
const -73.0641 329.028 -0.222 0.826 -717.948 571.819
ar.L1.D2.y -0.2606 0.168 -1.552 0.131 -0.590 0.068
ma.L1.D2.y -1.0000 0.084 -11.969 0.000 -1.164 -0.836
Roots
Real Imaginary Modulus Frequency
AR.1 -3.8366 +0.0000j 3.8366 0.5000
MA.1 1.0000 +0.0000j 1.0000 0.0000

Prediction

In [83]:
fit_model.plot_predict()
plt.title('Forecast vs Actual')
pd.DataFrame(fit_model.resid).plot()
forcast = fit_model.forecast(steps=6)
pred_y = forcast[0].tolist()
pred = pd.DataFrame(pred_y)